from pyspark.context import SparkContext


sc = SparkContext()

# RDD of raw text lines; each line is one student record (comma-separated,
# class name in the last field — see map_fun below).
lines_rdd = sc.textFile("../../data/students.txt")

print("执行在Driver端")

# Accumulator: gives a global running count without launching a new job
# (contrast with count() at the bottom of the file, which triggers its own job).
# Step 1: define the accumulator on the Driver side.
count_acc = sc.accumulator(0)


def map_fun(line):
    """Turn one input line into a (class_name, 1) pair.

    Executed on the executors; as a side effect, each call bumps the
    driver-defined accumulator ``count_acc`` by one (step 2 of the
    accumulator pattern).
    """
    print("执行在Executor端")

    # Step 2: accumulate on the Executor side.
    count_acc.add(1)

    # The class name is the last comma-separated field of the record.
    fields = line.split(",")
    return (fields[-1], 1)

# Lazy transformation: map_fun does not run until an action is triggered below.
kv_rdd = lines_rdd.map(map_fun)

print("执行在Driver端")

# Action: triggers the job, so map_fun runs on the executors here and the
# accumulator is updated as a side effect of this call.
kv_rdd.foreach(print)

# Step 3: read the accumulated result back on the Driver side.
# NOTE(review): this value is only meaningful after the action above has run;
# re-running another action over kv_rdd would add to the accumulator again.
print(count_acc.value)

# count() can also report the number of lines, but it launches a new job.
count = lines_rdd.count()
print(count)
